Fall 2019 (191)
Student: Faris Hijazi s201578750
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import utils
import dp_kmeans
%matplotlib inline
epsilon = 0.1  # privacy budget (epsilon)
k = 5          # default number of clusters

# Only the last data_dir assignment takes effect; the Amazon datasets are kept
# here as commented-out alternatives (the first one was previously a live
# assignment that was immediately overwritten — dead code).
# data_dir = './data/amzn-anon-access-samples-history-2.0.csv'
# data_dir = './data/amzn-anon-access-samples-2.0.csv'
data_dir = './data/ipums.csv'
data_raw = pd.read_csv(data_dir)
print('shape:', data_raw.shape)
print('count NaNs:\n', data_raw.isna().sum())
data_raw.head()
data_raw.describe()
# trimming the data: keep the first rows only (.loc slicing is end-inclusive,
# so this keeps rows 0..2000)
data_raw = data_raw.loc[:2000, :]
data_raw.shape
# normalizing the values
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler as StdSc
# Fit PCA on the trimmed raw data (all components kept).
pca = PCA().fit(data_raw)
# data = pd.DataFrame(StandardScaler().fit_transform(data_raw), columns=data_raw.columns)
# Project onto the principal components, then standardize each component.
# NOTE(review): the PCA-transformed columns are labeled with the ORIGINAL
# column names, but after `pca.transform` each column is a principal
# component, not the named raw feature. Downstream cells select e.g.
# 'Birth place' from `data` — confirm this relabeling is intended.
data = pd.DataFrame(StdSc().fit_transform(pca.transform(data_raw)), columns=data_raw.columns)
data.head()
data.describe()
Choosing a subset of features ['Age', 'Gender', 'Marital', 'Race status', 'Birth place', 'Language', 'Occupation', 'Income (K)']
pd.plotting.scatter_matrix(data, figsize=(25, 25));
| Raw Data | PCA |
|---|---|
| ![]() | ![]() |
# Restrict clustering to three features of interest and inspect them pairwise.
selected_features = ['Birth place', 'Language', 'Income (K)']
df = data.loc[:, selected_features]
pd.plotting.scatter_matrix(df, figsize=(14, 14));
# normal kmeans clustering
from sklearn.cluster import KMeans
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider
def regular_kmeans(k=k):
    """Fit vanilla (non-private) k-means on `df` and plot the clustering.

    Returns the fitted centroids and the per-sample labels.
    """
    model = KMeans(n_clusters=k, random_state=0).fit(df.values)
    centers = model.cluster_centers_
    assignments = model.labels_
    print('regular kmeans')
    utils.kmeans_matrix(df.values, k, centers, assignments, chunk_size=3, columns=df.columns)
    return centers, assignments
interact_manual(regular_kmeans, k=IntSlider(min=2, max=20, step=1, value=k))
This is done following the below diagram:

For each of the k clusters we accumulate the sum of the points and the num (number of records), and the centroids are updated as $centroid = \frac{sum}{num}$. Laplace noise is drawn and added to both sum and num: $noise = Lap(\frac{\Delta f}{\epsilon})$ (TODO: the denominator in the original text was garbled — confirm). Basic definition: the silhouette (contour) coefficient is a way of evaluating clustering results. Combining cohesion and separation, it can be used to compare different algorithms, or different runs of the same algorithm, on the same original data. For a sample point $i$, the silhouette coefficient is computed as follows:

$$S(i) = \frac{b_i - a_i}{\max(a_i, b_i)} \tag{4}$$
In the formula:
$a_i$ represents the average distance (dissimilarity) between sample $i$ and the other samples in the same cluster.
The smaller $a_i$ is, the better sample $i$ is clustered.
$b_i$ represents the minimum, over all other clusters, of the average distance from $i$ to the samples of that cluster.
That is, $b_i = \min\{b_{i1}, b_{i2}, \dots, b_{ik}\}$. The silhouette coefficient lies in $[-1, 1]$.
The larger $S(i)$ is, the more compact and well-separated the cluster containing point $i$ is. The average silhouette coefficient for each cluster is then calculated as follows:
$$S(k) = \frac{1}{num_k} \sum_{i \in cluster_k} S(i) \tag{5}$$
In the formula, numk stands for the number of samples in cluster No. k. The larger the S(k) value, the better the clustering effect and vice versa.
# Scratch check: cycle the cluster ids 0..n_clusters-1 over the 9 points of X.
n_clusters = 3
X = np.arange(9)
# np.resize repeats the id array cyclically until it has len(X) entries —
# same result as tiling [0..n_clusters-1] and truncating.
np.resize(np.arange(n_clusters), len(X))
from importlib import reload; reload(utils); reload(dp_kmeans);
# dp kmeans
n_clusters = k
eps = 0.5
## generating data
from sklearn.datasets import make_blobs
# NOTE(review): these blobs are generated but never used in this cell — the
# DP k-means call below runs on `df`, not `X_blobs`. Confirm whether this
# generation is leftover from an earlier experiment.
X_blobs, y_blobs = make_blobs(n_samples=1000, centers=n_clusters, n_features=2, random_state=12)
print('X_blobs.shape', X_blobs.shape)
# Differentially-private k-means on the selected features.
print('DP Kmeans eps=', eps)
centroids_dp, labels_dp = dp_kmeans.kmeans(df, eps=eps, n_clusters=n_clusters, plot=True,
seed=None, verbose=False, plot_every=None)
# regular kmeans
# Baseline: non-private k-means on the same features, for comparison with DP.
clf = KMeans(n_clusters=n_clusters, random_state=0).fit(df.values)
centroids_reg, labels_reg = clf.cluster_centers_, clf.labels_
print('Regular Kmeans')
utils.kmeans_matrix(df, n_clusters, centroids_reg, labels_reg, chunk_size=3);
# Side-by-side comparison of regular vs DP centroids; plotted twice with
# different chunk sizes (presumably to vary subplot layout — confirm).
utils.kmeans_matrix_compare(df, n_clusters, centroids_reg, labels_reg, centroids_dp, labels_dp, chunk_size=3);
utils.kmeans_matrix_compare(df, n_clusters, centroids_reg, labels_reg, centroids_dp, labels_dp, chunk_size=2);
# DP k-means on the full transformed dataset, with a tighter budget (eps=0.1).
centroids_dp, labels_dp = dp_kmeans.kmeans(data, eps=0.1, n_clusters=n_clusters, MAX_LOOPS=100, seed=0)
print('centroids:', centroids_dp.shape)
# Non-private baseline on the same data.
clf = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
centroids_reg, labels_reg = clf.cluster_centers_, clf.labels_
utils.kmeans_matrix(data, n_clusters, centroids_reg, labels_reg, chunk_size=3);
We compare the results between the normal k-means and the DP k-means; the below plot shows which centroids moved where. This technique guesses the corresponding centroids by pairing the closest ones.
# showing the changes in centroids
# Pairs each DP centroid with its closest regular centroid and plots the shift.
utils.kmeans_matrix_compare(data, n_clusters, centroids_reg, labels_reg, centroids_dp, labels_dp, chunk_size=3);
utils.kmeans_matrix_compare(data, n_clusters, centroids_reg, labels_reg, centroids_dp, labels_dp);
Clustering is an unsupervised problem, and we can't measure the performance if we don't have the ground truth values.
However, some metrics can still be used even if the true labels are unknown, such as the Davies-Bouldin score.
(Lower is better)
from sklearn import metrics
# Davies-Bouldin index (lower is better) needs no ground-truth labels,
# so it can score both clusterings directly.
for name, cluster_labels in (('regular', labels_reg), ('dp', labels_dp)):
    score = metrics.davies_bouldin_score(data.values, cluster_labels)
    print(name + ' kmeans davies_bouldin_score:', score)
Another way to get by this problem, is to generate data with known labels, just for measuring performance. Notice that we need to choose the same number of centers that we will be passing to the kmeans (in this case 5)
I made sure to choose a seed where the clusters aren't intersecting too much; random_state=12 seemed fine (note: the cell below actually uses random_state=9 — confirm which seed was intended).
from sklearn.datasets import make_blobs
n_features = 3
n_clusters = 5
# Synthetic blobs with known labels so clustering quality can be measured.
# NOTE(review): the prose above mentions random_state=12, but 9 is used
# here — confirm which seed was intended.
X_blobs, y_blobs = make_blobs(n_samples=5000, centers=n_clusters, n_features=n_features, random_state=9)
print(X_blobs.shape, y_blobs.shape)
labels_true = y_blobs
## plotting: 3D scatter, one color per ground-truth cluster
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
colors = ['r', 'g', 'b', 'y', 'c', 'm']
for k_ in range(n_clusters):
    # Vectorized boolean-mask indexing replaces the per-row list
    # comprehension (same result, no Python-level loop over 5000 rows).
    points = X_blobs[y_blobs == k_]
    ax.scatter(*points.T, s=7, c=colors[k_])
# Non-private baseline clustering of the blobs.
clf = KMeans(n_clusters=n_clusters, random_state=0).fit(X_blobs)
centroids_reg2, labels_reg2 = clf.cluster_centers_, clf.labels_
#executed in 40ms
from importlib import reload; reload(utils); reload(dp_kmeans);
# DP k-means on the synthetic blobs with a fixed seed for reproducibility.
centroids_dp2, labels_dp2 = dp_kmeans.kmeans(X_blobs, eps=0.5, n_clusters=n_clusters, STOP_THRESHOLD=0.001, MAX_LOOPS=1000, seed=42)
# Score both clusterings against the known ground-truth labels.
runs = (('regular kmeans', labels_reg2), ('dp kmeans', labels_dp2))
print('adjusted rand scores')
for name, predicted in runs:
    print(name + ':\t', metrics.adjusted_rand_score(labels_true, predicted) * 100, '%')
print('v_measure_score')
for name, predicted in runs:
    print(name + ':\t', metrics.v_measure_score(labels_true, predicted) * 100, '%')